Import data:

We need only 5 columns from the dataset. Some data exploration shows that the break points columns are NA for the datasets from 1990 and before. So, those .csv files can be ignored for our purposes.

matches = read_csv("data/atp/atp_matches_2000.csv", col_select = c(-winner_seed,-loser_seed))
## Rows: 3378 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): tourney_id, tourney_name, surface, tourney_level, winner_entry, wi...
## dbl (33): draw_size, tourney_date, match_num, winner_id, winner_ht, winner_a...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## import only the columns you need
# matches = read_csv("data/atp/atp_matches_1991.csv", 
#                    col_types = cols_only(best_of = 'i', w_bpFaced = 'i', w_bpSaved = 'i', l_bpFaced = 'i', l_bpSaved = 'i'))

matches %>% glimpse
## Rows: 3,378
## Columns: 47
## $ tourney_id         <chr> "2000-301", "2000-301", "2000-301", "2000-301", "20…
## $ tourney_name       <chr> "Auckland", "Auckland", "Auckland", "Auckland", "Au…
## $ surface            <chr> "Hard", "Hard", "Hard", "Hard", "Hard", "Hard", "Ha…
## $ draw_size          <dbl> 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,…
## $ tourney_level      <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "…
## $ tourney_date       <dbl> 20000110, 20000110, 20000110, 20000110, 20000110, 2…
## $ match_num          <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, …
## $ winner_id          <dbl> 103163, 102607, 103252, 103507, 102103, 102021, 101…
## $ winner_entry       <chr> NA, "Q", NA, NA, "Q", NA, NA, NA, NA, "Q", NA, NA, …
## $ winner_name        <chr> "Tommy Haas", "Juan Balcells", "Alberto Martin", "J…
## $ winner_hand        <chr> "R", "R", "R", "R", "R", "R", "R", "R", "R", "R", "…
## $ winner_ht          <dbl> 188, 190, 175, 183, 180, 175, 185, 180, 193, 193, 1…
## $ winner_ioc         <chr> "GER", "ESP", "ESP", "ESP", "USA", "USA", "SWE", "S…
## $ winner_age         <dbl> 21.77139, 24.55852, 21.39083, 19.90965, 27.38125, 2…
## $ loser_id           <dbl> 101543, 102644, 102238, 103819, 102765, 101647, 103…
## $ loser_entry        <chr> NA, NA, NA, NA, NA, NA, "WC", "Q", NA, NA, NA, "WC"…
## $ loser_name         <chr> "Jeff Tarango", "Franco Squillari", "Alberto Berasa…
## $ loser_hand         <chr> "L", "L", "R", "R", "R", "R", "R", "R", "L", "R", "…
## $ loser_ht           <dbl> 180, 183, 173, 185, 185, 175, 185, 188, 193, 188, 1…
## $ loser_ioc          <chr> "USA", "ARG", "ESP", "SUI", "FRA", "ZIM", "NZL", "U…
## $ loser_age          <dbl> 31.13758, 24.38604, 26.53525, 18.42300, 23.77002, 3…
## $ score              <chr> "7-5 4-6 7-5", "7-5 7-5", "6-3 6-1", "6-4 6-4", "0-…
## $ best_of            <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ round              <chr> "R32", "R32", "R32", "R32", "R32", "R32", "R32", "R…
## $ minutes            <dbl> 108, 85, 56, 68, 115, 140, 84, 77, 90, 57, 144, 96,…
## $ w_ace              <dbl> 18, 5, 0, 5, 1, 5, 4, 8, 5, 3, 1, 7, 3, 5, 16, 11, …
## $ w_df               <dbl> 4, 3, 0, 1, 2, 3, 8, 2, 3, 1, 9, 3, 3, 1, 5, 1, 0, …
## $ w_svpt             <dbl> 96, 76, 55, 53, 98, 101, 60, 69, 68, 41, 111, 65, 4…
## $ w_1stIn            <dbl> 49, 52, 35, 28, 66, 59, 31, 29, 43, 28, 65, 44, 30,…
## $ w_1stWon           <dbl> 39, 39, 25, 26, 39, 45, 28, 26, 30, 23, 45, 32, 23,…
## $ w_2ndWon           <dbl> 28, 13, 12, 15, 14, 19, 14, 20, 17, 10, 22, 12, 12,…
## $ w_SvGms            <dbl> 17, 12, 8, 10, 13, 17, 10, 10, 13, 8, 17, 11, 9, 8,…
## $ w_bpSaved          <dbl> 3, 5, 1, 0, 6, 1, 2, 1, 4, 2, 9, 2, 2, 3, 2, 0, 2, …
## $ w_bpFaced          <dbl> 5, 6, 1, 0, 11, 4, 3, 2, 7, 2, 13, 4, 3, 3, 4, 0, 2…
## $ l_ace              <dbl> 7, 5, 0, 11, 8, 3, 1, 7, 11, 0, 2, 8, 2, 0, 14, 4, …
## $ l_df               <dbl> 8, 10, 6, 2, 8, 3, 5, 3, 10, 3, 5, 4, 7, 5, 4, 0, 4…
## $ l_svpt             <dbl> 106, 74, 56, 70, 92, 105, 69, 64, 87, 63, 110, 100,…
## $ l_1stIn            <dbl> 55, 32, 33, 43, 46, 65, 45, 41, 43, 29, 55, 59, 32,…
## $ l_1stWon           <dbl> 39, 25, 20, 29, 34, 41, 28, 24, 33, 21, 38, 38, 20,…
## $ l_2ndWon           <dbl> 29, 18, 7, 14, 18, 21, 12, 13, 14, 11, 26, 17, 8, 1…
## $ l_SvGms            <dbl> 17, 12, 8, 10, 12, 17, 10, 9, 14, 8, 17, 11, 10, 8,…
## $ l_bpSaved          <dbl> 4, 3, 7, 6, 5, 2, 9, 3, 5, 5, 3, 14, 1, 7, 2, 2, 4,…
## $ l_bpFaced          <dbl> 7, 6, 11, 8, 9, 5, 12, 6, 11, 9, 8, 18, 5, 11, 5, 5…
## $ winner_rank        <dbl> 11, 211, 48, 45, 167, 50, 60, 43, 37, 115, 72, 38, …
## $ winner_rank_points <dbl> 1612, 157, 726, 768, 219, 722, 626, 785, 843, 344, …
## $ loser_rank         <dbl> 63, 49, 59, 61, 34, 70, 246, 334, 68, 95, 52, 76, 5…
## $ loser_rank_points  <dbl> 595, 723, 649, 616, 873, 563, 135, 88, 571, 416, 71…
#calculate total successful break points in match
matches = matches %>% mutate(
  total_breaks = (w_bpFaced-w_bpSaved) + (l_bpFaced-l_bpSaved)
)

matches %>% select(best_of, total_breaks) %>% 
  mutate(breaks_per_set = total_breaks/best_of) %>% 
  filter(best_of == 5) %>% 
  summarise(avg_breaks = mean(breaks_per_set, na.rm = TRUE))
## # A tibble: 1 × 1
##   avg_breaks
##        <dbl>
## 1       1.55

Break point data is available for ATP from 1991 onwards, and for WTA from 2003 onwards.

After dropping NA’s, we are left with ~1 million observations (ATP) and ~35k observations (WTA).

Help taken from: # see: https://stackoverflow.com/questions/5788117/only-read-selected-columns, # and: https://stackoverflow.com/questions/41355912/extracting-a-numeric-value-from-multiple-filename-in-r

# ----------- read in all ATP match csv files ----------- 
filenames = list.files(path = "./data/atp/", pattern = "atp_matches*", full.names = TRUE) 

combo_data_ATP = map_df(filenames,
                    ~read_csv(.x, 
                              col_types = cols_only(best_of = 'i', w_bpFaced = 'i', w_bpSaved = 'i', l_bpFaced = 'i', l_bpSaved = 'i', surface = "c"), 
                              show_col_types = FALSE) 
                    %>% mutate(year = as.integer(stri_extract_last(.x, regex = "(\\d+)")))
) %>%
  na.omit()

# ----------- calculate stats -----------
breaks_stats_ATP = combo_data_ATP %>% mutate(
  total_breaks = (w_bpFaced-w_bpSaved) + (l_bpFaced-l_bpSaved)) %>%
  mutate(breaks_per_set = total_breaks/best_of) %>% 
  group_by(year, surface) %>% 
  summarise(
    avg_breaks_surface = mean(breaks_per_set, na.rm = TRUE)
    )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
breaks_per_year_ATP = breaks_stats_ATP %>% summarise(avg_breaks = mean(avg_breaks_surface))
# ----------- read in all WTA match csv files ----------- 
filenames = list.files(path = "./data/wta/", pattern = "wta_matches*", full.names = TRUE) 

combo_data_WTA = map_df(filenames,
                        ~read_csv(.x, 
                                  col_types = cols_only(best_of = 'i', w_bpFaced = 'i', w_bpSaved = 'i', l_bpFaced = 'i', l_bpSaved = 'i', surface = "c"), 
                                  show_col_types = FALSE) 
                        %>% mutate(year = as.integer(stri_extract_last(.x, regex = "(\\d+)")))
) %>%
  na.omit()


# ----------- calculate stats -----------
breaks_stats_WTA = combo_data_WTA %>% mutate(
  total_breaks = (w_bpFaced-w_bpSaved) + (l_bpFaced-l_bpSaved)) %>%
  mutate(breaks_per_set = total_breaks/best_of) %>% 
  group_by(year, surface) %>% 
  summarise(
    avg_breaks_surface = mean(breaks_per_set, na.rm = TRUE)
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
breaks_per_year_WTA = breaks_stats_WTA %>% 
  summarise(avg_breaks = mean(avg_breaks_surface))

breaks_per_year_WTA
## # A tibble: 20 × 2
##    year avg_breaks
##   <int>      <dbl>
## 1  2003       2.46
## 2  2004       2.58
## 3  2005       2.53
## 4  2006       2.51
## 5  2007       2.40
## 6  2008       2.53
## # … with 14 more rows
# Plot 1
# -------  Combine breaks_per_year datasets for men and women, and plot ----
bind_rows(breaks_per_year_ATP, breaks_per_year_WTA, .id = "id") %>% 
  mutate(id=recode(id, '1'='ATP', '2'='WTA')) %>% 
  ggplot(aes(x=year, y=avg_breaks, colour=id)) +
  geom_point() +  
  geom_hline(yintercept=mean(breaks_per_year_ATP$avg_breaks), colour="red") +
  geom_hline(yintercept=mean(breaks_per_year_WTA$avg_breaks), colour="blue") +
  labs(
    title = "How common are service breaks? (1991-current)",
    y = "Breaks per set",
    x = "Time"
  )

# Plot 2
# ------- Combine combo_data_WTA datasets for men and women, and plot ------
# NB: This has additional `playing surface` information

bind_rows(breaks_stats_ATP, breaks_stats_WTA, .id = "id") %>% 
  mutate(id=recode(id, '1'='ATP', '2'='WTA')) %>% 
  ggplot(aes(x=year, y=avg_breaks_surface, colour=id)) +
  geom_point() +
  facet_wrap(vars(surface)) +
  labs(
    title = "How common are service breaks? (1991-current)",
    subtitle = "Dependence on playing surface",
    y = "Breaks per set",
    x = "Time"
  )

breaks_stats_WTA
## # A tibble: 68 × 3
## # Groups:   year [20]
##    year surface avg_breaks_surface
##   <int> <chr>                <dbl>
## 1  2003 Clay                  2.71
## 2  2003 Grass                 2.21
## 3  2003 Hard                  2.45
## 4  2004 Clay                  2.91
## 5  2004 Grass                 2.19
## 6  2004 Hard                  2.63
## # … with 62 more rows